library(tidyverse)   # data wrangling and ggplot2
library(GGally)      # ggcorr correlation heatmap
library(astsa)       # time series analysis helpers
library(cowplot)     # plot_grid for arranging plots
library(zoo)         # time series objects
library(mlr)         # machine learning framework
The data set has 124,494 rows and 12 columns (1 date, 1 device ID, 9 features, and 1 target).
data.set
Convert the date column from character to Date:
data.set %>% mutate(date = as.Date(date, "%Y-%m-%d")) -> data.set
Make a summary of the data set:
summary(data.set)
date device failure
Min. :2015-01-01 Length:124494 Min. :0.0000000
1st Qu.:2015-02-09 Class :character 1st Qu.:0.0000000
Median :2015-03-27 Mode :character Median :0.0000000
Mean :2015-04-16 Mean :0.0008514
3rd Qu.:2015-06-17 3rd Qu.:0.0000000
Max. :2015-11-02 Max. :1.0000000
attribute1 attribute2 attribute3 attribute4
Min. : 0 Min. : 0.0 Min. : 0.00 Min. : 0.000
1st Qu.: 61284762 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 0.000
Median :122797388 Median : 0.0 Median : 0.00 Median : 0.000
Mean :122388103 Mean : 159.5 Mean : 9.94 Mean : 1.741
3rd Qu.:183309640 3rd Qu.: 0.0 3rd Qu.: 0.00 3rd Qu.: 0.000
Max. :244140480 Max. :64968.0 Max. :24929.00 Max. :1666.000
attribute5 attribute6 attribute7 attribute8
Min. : 1.00 Min. : 8 Min. : 0.0000 Min. : 0.0000
1st Qu.: 8.00 1st Qu.:221452 1st Qu.: 0.0000 1st Qu.: 0.0000
Median :10.00 Median :249800 Median : 0.0000 Median : 0.0000
Mean :14.22 Mean :260173 Mean : 0.2925 Mean : 0.2925
3rd Qu.:12.00 3rd Qu.:310266 3rd Qu.: 0.0000 3rd Qu.: 0.0000
Max. :98.00 Max. :689161 Max. :832.0000 Max. :832.0000
attribute9
Min. : 0.00
1st Qu.: 0.00
Median : 0.00
Mean : 12.45
3rd Qu.: 0.00
Max. :18701.00
The heatmap and correlation matrix show:
ggcorr(data.set[3:12], method = c("everything", "pearson"))
cor(data.set[3:12])
failure attribute1 attribute2 attribute3 attribute4
failure 1.0000000000 0.0019834843 0.052901580 -0.0009484302 0.067398475
attribute1 0.0019834843 1.0000000000 -0.004249916 0.0037014853 0.001835788
attribute2 0.0529015799 -0.0042499163 1.000000000 -0.0026168612 0.146592542
attribute3 -0.0009484302 0.0037014853 -0.002616861 1.0000000000 0.097452164
attribute4 0.0673984749 0.0018357883 0.146592542 0.0974521643 1.000000000
attribute5 0.0022697309 -0.0033758819 -0.013998641 -0.0066964047 -0.009772824
attribute6 -0.0005503238 -0.0015219713 -0.026349593 0.0090270256 0.024869926
attribute7 0.1190545851 0.0001506758 0.141367266 -0.0018839415 0.045630854
attribute8 0.1190545851 0.0001506758 0.141367266 -0.0018839415 0.045630854
attribute9 0.0016215740 0.0011207497 -0.002735715 0.5323664423 0.036068992
attribute5 attribute6 attribute7 attribute8 attribute9
failure 0.002269731 -0.0005503238 0.1190545851 0.1190545851 0.001621574
attribute1 -0.003375882 -0.0015219713 0.0001506758 0.0001506758 0.001120750
attribute2 -0.013998641 -0.0263495934 0.1413672661 0.1413672661 -0.002735715
attribute3 -0.006696405 0.0090270256 -0.0018839415 -0.0018839415 0.532366442
attribute4 -0.009772824 0.0248699258 0.0456308537 0.0456308537 0.036068992
attribute5 1.000000000 -0.0170493871 -0.0093837348 -0.0093837348 0.005949416
attribute6 -0.017049387 1.0000000000 -0.0122068373 -0.0122068373 0.021152492
attribute7 -0.009383735 -0.0122068373 1.0000000000 1.0000000000 0.006860674
attribute8 -0.009383735 -0.0122068373 1.0000000000 1.0000000000 0.006860674
attribute9 0.005949416 0.0211524917 0.0068606743 0.0068606743 1.000000000
Confirm that attribute 7 and attribute 8 are identical:
data.set %>% mutate(diff_7_8 = attribute7 - attribute8) %>% filter(diff_7_8 != 0)
Drop attribute 8 from the data set:
data.set %>% select(-attribute8) -> data.set
In total 1,169 devices have been monitored, each with 1 to 304 records.
data.set %>% group_by(device) %>% summarise(number_of_records = n()) %>% ungroup() %>% arrange(-number_of_records)
data.set %>% group_by(device) %>% summarise(number_of_records = n()) %>% ungroup() %>%
  select(number_of_records) %>%
  ggplot(aes(x = number_of_records)) + geom_histogram(binwidth = 5) +
  xlab("Number of Records per Device") + ggtitle("Histogram of Number of Records per Device")
In total 106 devices failed, and each of them failed only once.
data.set %>% filter(failure == 1) %>% group_by(device) %>% summarise(number_of_records = n()) %>% ungroup() %>% arrange(-number_of_records)
Keep a list of the failed devices for convenience later on:
data.set %>% filter(failure == 1) %>% select(device) %>% unique() -> failed_device
Each of the failed devices has 5 to 299 records in the data set.
data.set %>% group_by(device) %>% summarise(number_of_records = n()) %>% ungroup() %>%
  arrange(-number_of_records) %>%
  inner_join(data.set %>% filter(failure == 1) %>% select(device), by = "device")
data.set %>% group_by(device) %>% summarise(number_of_records = n()) %>% ungroup() %>%
  arrange(-number_of_records) %>%
  inner_join(data.set %>% filter(failure == 1) %>% select(device), by = "device") %>%
  select(number_of_records) %>%
  ggplot(aes(x = number_of_records)) + geom_histogram(binwidth = 5) +
  xlab("Number of Records per Failed Device") + ggtitle("Histogram of Number of Records per Failed Device")
***
#### Overview - Missing Record Days during Surveillance
Most of the devices have one record per day during their surveillance period; however, 173 of the 1,169 devices have some days without any record in the data set (at most 144 missing days, at least 1). 15 of these 173 devices later failed (at most 119 missing days, at least 2).
data.set %>% group_by(device) %>%
  summarise(number_of_records = n(),
            number_of_days_during_surveillance = as.integer(max(date) - min(date) + 1)) %>%
  ungroup() %>%
  mutate(missing_record_days = number_of_days_during_surveillance - number_of_records) %>%
  filter(missing_record_days > 0) %>%
  select(device, missing_record_days) %>%
  arrange(-missing_record_days) -> devices_missing_records
devices_missing_records
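The count of failed devices among them, claimed above, can be double-checked with a short sketch (an addition, not part of the original code); it should report 15 devices, with between 2 and 119 missing days:
devices_missing_records %>%
  inner_join(failed_device, by = "device") %>%
  # failed devices that also have missing record days
  summarise(number_of_failed_devices = n(),
            max_missing_days = max(missing_record_days),
            min_missing_days = min(missing_record_days))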
All but 5 of the failed devices failed on the last day of their surveillance. Those 5 could have been repaired and kept running.
data.set %>% group_by(device) %>% summarise(last_record_date = max(date), first_record_data = min(date)) %>% ungroup() -> record_date
failed_device %>% left_join(record_date, by = "device") %>%
  select(device, last_record_date) %>%
  left_join(data.set, by = c("device" = "device", "last_record_date" = "date")) %>%
  filter(failure == 0) %>%
  left_join(data.set %>% filter(failure == 1) %>% select(device, date), by = "device") %>%
  rename(date_of_failure = date) %>%
  select(device, date_of_failure, last_record_date)
Keep a list of the 5 devices for potential use later, and rename each device to "DeviceID_repaired" after its repair (i.e. regard a repaired device as a new one).
failed_device %>% left_join(record_date, by = "device") %>%
  select(device, last_record_date) %>%
  left_join(data.set[1:3], by = c("device" = "device", "last_record_date" = "date")) %>%
  filter(failure == 0) %>%
  left_join(data.set %>% filter(failure == 1) %>% select(device, date), by = "device") %>%
  rename(date_of_failure = date) %>%
  select(1, 2, 4) -> repaired_devices
repaired_devices %>% left_join(data.set, by = "device") %>%
  mutate(device = ifelse(date <= date_of_failure, device, paste0(device, "_repaired"))) %>%
  select(1, 4:13) -> data_split_repaired_device
Pick some devices and plot how their features change with time, to get a general feel for the data.
data.set %>% filter(device == "S1F0E9EP") %>% gather(all_attribute, value, starts_with('attribute')) %>%
  ggplot(aes(date, value)) + geom_line() + geom_point(size = 0.5) +
  facet_wrap(~ all_attribute, scales = 'free_y')
According to the plots above, attributes 2 - 9 may increase monotonically with time, and attribute 9 might even be constant (perhaps a categorical variable). A check shows that only attributes 3, 4, 6, and 9 increase monotonically, and that attribute 9 is not constant, though it varies for only 45 devices (including 8 failed ones). The conclusion also holds if a repaired device is regarded as a new device (the verifying code for that part was not included; a sketch follows the loop output below).
data.set %>% group_by(device) %>% summarise(last_record_date = max(date), first_record_data = min(date)) %>% ungroup() -> record_date
for (val in c(2:7, 9)) {
  # last recorded value of the attribute for each device
  record_date %>%
    inner_join(data.set, by = c("device" = "device", "last_record_date" = "date")) %>%
    select(c(device, paste0("attribute", val))) -> last_record
  # per-device maximum of the attribute
  data.set %>% group_by(device) %>% summarise_at(paste0("attribute", val), max) -> max_record
  # if the last value equals the maximum for every device, the attribute peaked
  # at the end of surveillance (a proxy check for monotonic increase)
  if (isTRUE(all.equal(last_record, max_record %>% mutate_if(is.numeric, as.integer)))) {
    print(paste0("attribute", val, " monotonically increases."))
  } else {
    print(paste0("attribute", val, " is not monotonic."))
  }
}
[1] "attribute2 is not monotonical."
[1] "attribute3 monotonically increases."
[1] "attribute4 monotonically increases."
[1] "attribute5 is not monotonical."
[1] "attribute6 monotonically increases."
[1] "attribute7 is not monotonical."
[1] "attribute9 monotonically increases."
There are 530 devices with IDs starting with S1 (42 failed), 420 with W1 (42 failed), and 219 with Z1 (22 failed). The proportion of failed devices among S1 is slightly lower than among W1 and Z1.
data.set %>% select(device) %>% unique() %>% mutate(mode = substr(device, 1, 2)) %>%
  group_by(mode) %>% summarise(number_of_device = n()) %>%
  left_join(
    data.set %>% filter(failure == 1) %>% select(device) %>% unique() %>%
      mutate(mode = substr(device, 1, 2)) %>% group_by(mode) %>%
      summarise(number_of_failed_device = n()),
    by = "mode") %>%
  mutate(failed_device_percentage = paste0(round(number_of_failed_device/number_of_device*100, 2), "%")) -> table
table
Next comes a small hypothesis test, with the null hypothesis that S1, W1, and Z1 devices fail with the same probability. The chi-squared test gives a p-value of 0.5263, suggesting the null hypothesis should not be rejected; therefore it is assumed that the S1/W1/Z1 prefixes do not indicate different designs that affect device performance (in reality, asking domain experts would be a more direct way to find out).
chisq.test(table[,2:3])
Pearson's Chi-squared test
data: table[, 2:3]
X-squared = 1.2837, df = 2, p-value = 0.5263
Before looking into each feature, create a helper function to make plotting densities easier later on.
ggplot_density_function <- function(data.set, var, x_title = "", remove_outlier = TRUE){
  data.set %>% mutate(failure = as.factor(failure)) -> data.set
  if (remove_outlier) {
    # drop values flagged as outliers by the boxplot (1.5 * IQR) rule
    outliers <- boxplot(data.set[[var]], plot = FALSE)$out
    data.set <- data.set %>% filter(!(.data[[var]] %in% outliers))
  }
  plot <- ggplot(data.set, aes(.data[[var]], fill = failure)) +
    geom_density(alpha = .5) + labs(x = x_title) + theme(legend.position = "top") +
    scale_fill_manual(values = c('#999999', '#E69F00'))
  return(plot)
}
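As a usage note (an added example): with the default remove_outlier = TRUE, the function drops boxplot-rule outliers before plotting, e.g.
data.set %>% ggplot_density_function(var = "attribute5", x_title = "attribute5 (outliers removed)")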
Plot attribute 1 for one example, "W1F0T0B1", as a time series:
data.set%>% filter(device == "W1F0T0B1") %>%
ggplot(aes(date, attribute1)) + geom_line()
The correlogram shows little autocorrelation across the different lags.
acf(data.set$attribute1[data.set$device == "W1F0T0B1"], plot = TRUE)
data.set %>% ggplot_density_function(var = "attribute1", remove_outlier = FALSE)
Attribute 2 is highly right-skewed, with 558 distinct values that vary a lot (density function on the left, boxplot on the right).
plot_grid(data.set %>% ggplot_density_function(var = "attribute2", remove_outlier = FALSE),
          data.set %>% ggplot(aes(y = attribute2)) + geom_boxplot() +
            theme(axis.line = element_blank(), axis.text.x = element_blank(),
                  axis.text.y = element_blank(), axis.ticks = element_blank()),
          nrow = 1, ncol = 2, rel_widths = c(3/4, 1/4))
However, 118,110 of the 124,494 records have attribute 2 equal to 0. After removing all the 0s, the density distribution looks as below.
data.set %>% filter(attribute2 != 0) %>% ggplot_density_function(var = "attribute2", remove_outlier = FALSE)
Density function after taking the logarithm:
data.set %>% filter(attribute2 != 0) %>% mutate(attribute2_log = log(attribute2)) %>% ggplot_density_function(var = "attribute2_log", remove_outlier = FALSE)
Attribute 3 has only 47 distinct values; 115,359 of the 124,494 records are 0.
data.set %>% filter(attribute3 != 0) %>% ggplot_density_function(var = "attribute3", remove_outlier = FALSE)
The density is still right-skewed after taking the logarithm:
data.set %>% filter(attribute3 != 0) %>% mutate(attribute3_log = log(attribute3)) %>% ggplot_density_function(var = "attribute3_log", remove_outlier = FALSE)
Similar to attributes 2 and 3, attribute 4 is 0 in most records (115,156); the remaining values vary a lot.
data.set %>% filter(attribute4 != 0) %>% ggplot_density_function(var = "attribute4", remove_outlier = FALSE)
Attribute 4 after taking the logarithm:
data.set %>% filter(attribute4 != 0) %>% mutate(attribute4_log = log(attribute4)) %>% ggplot_density_function(var = "attribute4_log", remove_outlier = FALSE)
Attribute 5 has 60 different values in [1, 98].
data.set %>% ggplot_density_function(var = "attribute5", remove_outlier = FALSE)
Attribute 6 varies from 8 to 689,161.
data.set %>% ggplot_density_function(var = "attribute6", remove_outlier = FALSE)
Attribute 7 has 123,036 zeros; the rest lie in [6, 832].
data.set %>% filter(attribute7 != 0) %>% ggplot_density_function(var = "attribute7", remove_outlier = FALSE)
Attribute 7 after taking the logarithm:
data.set %>% filter(attribute7 != 0) %>% mutate(attribute7_log = log(attribute7)) %>% ggplot_density_function(var = "attribute7_log", remove_outlier = FALSE)
Attribute 9 is 0 in 97,358 records. The rest vary in [1, 18,701], taking only 65 distinct values.
data.set %>% filter(attribute9 != 0) %>% ggplot_density_function(var = "attribute9", remove_outlier = FALSE)
Attribute 9 after taking the logarithm:
data.set %>% filter(attribute9 != 0) %>% mutate(attribute9_log = log(attribute9)) %>% ggplot_density_function(var = "attribute9_log", remove_outlier = FALSE)
It is not possible to confirm this here (one would need to ask whoever is in charge of these devices), but we assume that days in operation can be approximated by days under surveillance.
Mark each of the five repaired devices as a new device after its repair:
data_split_repaired_device %>% rbind(data.set %>% filter(!(device %in% unique(repaired_devices$device)))) -> data.set
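A quick sanity check (an added sketch): the number of distinct devices should grow from 1,169 to 1,174, since each of the 5 repaired devices now also appears under a "_repaired" ID.
n_distinct(data.set$device)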
Get when each device started operation (surveillance):
data.set %>% group_by(device) %>% summarise(start_date = min(date)) %>% ungroup() -> started_in_operation
Calculate days in operation (surveillance) and add them to the data set.
data.set %>% left_join(started_in_operation, by = "device") %>% mutate(days_in_operation = as.integer(date - start_date + 1)) %>% select(-start_date) -> data.set
However, days in operation has only weak linear relationships with the features and the target.
ggcorr(data.set[3:12], method = c("everything", "pearson"))
The date feature potentially carries information about which times of the year have higher failure rates; this could be related to temperature, etc.
data.set %>% mutate(month = substr(date, 6, 7)) %>% group_by(month) %>%
  summarise(number_of_records = n()) %>%
  left_join(
    data.set %>% filter(failure == 1) %>% mutate(month = substr(date, 6, 7)) %>%
      group_by(month) %>% summarise(number_of_failures = n()),
    by = "month"
  ) %>%
  mutate(number_of_failures = replace_na(number_of_failures, 0)) %>%
  mutate(failure_percentage = paste0(round(number_of_failures/number_of_records*100, 3), "%")) -> table
table
A chi-squared test rejects the null hypothesis that the failure rate is the same across all the months of 2015, suggesting that certain months have a higher (or lower) failure rate. (The p-value is simulated since the monthly failure counts are small.)
chisq.test(table[,2:3], simulate.p.value = TRUE)
Pearson's Chi-squared test with simulated p-value (based on 2000
replicates)
data: table[, 2:3]
X-squared = 29.66, df = NA, p-value = 0.02499
Add a categorical feature "month" to the data set:
data.set %>% mutate(month = as.factor(substr(date,6,7))) -> data.set
Use the package "mlr" to create a classification task. Take a small part of the data set as a hold-out set (15 random failure records + 2,000 random non-failure records); the rest will be used for training and cross-validation.
set.seed(5)
data.set %>% arrange(-failure, device) -> data.set
# data.set is now sorted with the 106 failure records first, so take 15 of
# them plus 2,000 of the remaining non-failure rows as the hold-out set
hold.out = c(sample(106, 15), sample(nrow(data.set) - 106, 2000) + 106)
quick.test.task = makeClassifTask(id = "Quick_Test", data = data.set[3:11], target = "failure", positive = 1) %>% createDummyFeatures()
quick.test.task.cross.validation = subsetTask(quick.test.task, subset = setdiff(1:nrow(data.set), hold.out))
quick.test.task.hold.out = subsetTask(quick.test.task, subset = hold.out)
The data set is imbalanced, so a hybrid of oversampling and undersampling is used here. After oversampling the failures 40x and undersampling the non-failures to 1/30, the numbers of 0 and 1 samples in the cross-validation data set are close to each other (91 failure records × 40 = 3,640; 122,388 non-failure records / 30 ≈ 4,080). Adjusting the over/under-sampling rates will affect the False Positive Rate and False Negative Rate of the results, and they can be tuned based on needs.
quick.test.task.cross.validation.over = oversample(quick.test.task.cross.validation, rate = 40)
quick.test.task.cross.validation.over.under = undersample(quick.test.task.cross.validation.over, rate = 1/30)
table(getTaskTargets(quick.test.task.cross.validation.over.under))
0 1
4080 3640
Based on the exploratory data analysis, choose a tree-based model instead of regression- or SVM-based algorithms, given what was observed above: the features are heavily skewed and zero-inflated, sit on very different scales, and show only weak linear correlation with the target, all of which tree-based models handle without transformation.
Use Gradient Boosting Machine for the first test.
lnr.simple <- mlr::makeLearner("classif.gbm", predict.type = "prob", distribution = "bernoulli")
learningCurve <- mlr::generateLearningCurveData(learners = lnr.simple,
quick.test.task.cross.validation.over.under,
makeResampleDesc(method = "CV", iters = 5 ,predict = "both"),
seq(0.1, 1, by = 0.1),
measures = list(setAggregation(auc, test.mean),setAggregation(auc, train.mean)),show.info = FALSE)
plotLearningCurve(learningCurve, facet = "learner")
r = resample(lnr.simple,
quick.test.task.cross.validation.over.under, makeResampleDesc(method = "CV", iters = 5 ,predict = "both"), measures = list(fpr, fnr))
r$aggr
fpr.test.mean fnr.test.mean
0.08133404 0.23175177
df = generateThreshVsPerfData(r, measures = list(fpr, tpr, mmce))
plotROCCurves(df)
plotThreshVsPerf(df)
The model performs well when measured by aggregated train/test AUC and by aggregated train/test False Positive Rate (FPR) at the default threshold (0.5), but the False Negative Rate (FNR) is relatively poor.
When tested on the hold-out data set, the model performs fairly stably.
mod = train(lnr.simple, quick.test.task.cross.validation.over.under)
task.pred = predict(mod, task = quick.test.task.hold.out)
performance(task.pred, measures = list(fpr,fnr))
fpr fnr
0.09 0.20
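Since the learner predicts probabilities, the FPR/FNR trade-off can also be shifted without retraining by moving the decision threshold; a hedged sketch using mlr's setThreshold helper (0.3 is an arbitrary illustrative value):
# lowering the positive-class threshold trades a higher FPR for a lower FNR
task.pred.shifted = setThreshold(task.pred, 0.3)
performance(task.pred.shifted, measures = list(fpr, fnr))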
According to the exploratory data analysis, changes in certain features (e.g. attributes 2, 4, and 7) may be useful failure indicators.
Add the differences of attributes 2, 4, and 7 between the observed date and 1 to 3 days earlier (below is the code for the one-day difference of attribute 2; a sketch generalising it to all nine new columns follows).
data.set %>% arrange(device, date) %>%
  mutate(pre_attribute2 = lag(attribute2, 1), pre_device = lag(device, 1), pre_date = lag(date, 1)) %>%
  mutate(attribute2_d1 = ifelse(device == pre_device & as.integer(date - pre_date) == 1,
                                attribute2 - pre_attribute2, 0)) %>%
  mutate(attribute2_d1 = replace_na(attribute2_d1, 0)) %>%
  select(1:13, attribute2_d1) -> data.set
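The remaining differences (lags of 2 and 3 days, and attributes 4 and 7) were added the same way but their code was not shown; below is a hedged sketch of one way to generate all nine columns, following the attributeX_dk naming pattern used above:
data.set %>% arrange(device, date) -> data.set
for (attr in c("attribute2", "attribute4", "attribute7")) {
  for (k in 1:3) {
    new_col <- paste0(attr, "_d", k)   # e.g. attribute4_d2
    data.set <- data.set %>%
      # difference vs. k days earlier, only when the lagged row belongs to
      # the same device and lies exactly k days back; otherwise 0
      mutate(!!new_col := ifelse(device == lag(device, k) & as.integer(date - lag(date, k)) == k,
                                 .data[[attr]] - lag(.data[[attr]], k), 0)) %>%
      mutate(!!new_col := replace_na(.data[[new_col]], 0))
  }
}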
Create a new task based on the data set with the additional features:
data.set %>% arrange(-failure,device) -> data.set
add_features.task = makeClassifTask(id = "add_features", data = data.set[3:22], target = "failure", positive = 1) %>% createDummyFeatures()
add_features.task.cross.validation = subsetTask(add_features.task, subset = setdiff(1:nrow(data.set), hold.out))
add_features.task.hold.out = subsetTask(add_features.task,subset = hold.out)
add_features.task.cross.validation.over = oversample(add_features.task.cross.validation, rate = 40)
add_features.task.cross.validation.over.under = undersample(add_features.task.cross.validation.over, rate = 1/30)
learningCurve <- generateLearningCurveData(learners = lnr.simple,
add_features.task.cross.validation.over.under,
makeResampleDesc(method = "CV", iters = 5 ,predict = "both"),
seq(0.1, 1, by = 0.1),
measures = list(setAggregation(auc, test.mean),setAggregation(auc, train.mean)),show.info = FALSE)
plotLearningCurve(learningCurve, facet = "learner")
r = resample(lnr.simple,
add_features.task.cross.validation.over.under, makeResampleDesc(method = "CV", iters = 5 ,predict = "both"), measures = list(fpr, fnr))
r$aggr
fpr.test.mean fnr.test.mean
0.05735857 0.20364748
Adding the features improves the cross-validated AUC, mainly by decreasing FPR; FNR improves only slightly.
mod = train(lnr.simple, add_features.task.cross.validation.over.under)
task.pred = predict(mod, task = add_features.task.hold.out)
performance(task.pred, measures = list(fpr, fnr))
fpr fnr
0.0815000 0.2666667
On the hold-out data set the added features also decrease FPR; however, they don't help FNR, which actually gets worse.
Check the few wrongly predicted examples in the hold-out data set to see what's going on and whether any clue can be found:
data.set[1:12] %>% filter(device == "W1F19BPT") %>% gather(all_attribute, value, starts_with('attribute')) %>%
  ggplot(aes(date, value)) + geom_line() + geom_point(size = 0.5) +
  geom_vline(xintercept = data.set$date[data.set$device == "W1F19BPT" & data.set$failure == 1],
             color = "blue", size = 0.5) +
  facet_wrap(~ all_attribute, scales = 'free_y')
data.set[1:12] %>% filter(device == "W1F1230J") %>% gather(all_attribute, value, starts_with('attribute')) %>%
  ggplot(aes(date, value)) + geom_line() + geom_point(size = 0.5) +
  geom_vline(xintercept = data.set$date[data.set$device == "W1F1230J" & data.set$failure == 1],
             color = "blue", size = 0.5) +
  facet_wrap(~ all_attribute, scales = 'free_y')
data.set[1:12] %>% filter(device == "S1F11MB0") %>% gather(all_attribute, value, starts_with('attribute')) %>%
  ggplot(aes(date, value)) + geom_line() + geom_point(size = 0.5) +
  geom_vline(xintercept = data.set$date[data.set$device == "S1F11MB0" & data.set$failure == 1],
             color = "blue", size = 0.5) +
  facet_wrap(~ all_attribute, scales = 'free_y')
One of them is missing data, one has only attribute 1 varying with time, and one has only attributes 1 and 6 varying with time. Something to work on in the future.
Benchmark three tree-based models with their default hyperparameter settings: Gradient Boosting Machine (GBM), Random Forest, and eXtreme Gradient Boosting (XGBoost).
lrns = list(makeLearner("classif.gbm", predict.type = "prob", distribution = "bernoulli"), makeLearner("classif.randomForest", predict.type = "prob"),
makeLearner("classif.xgboost", predict.type = "prob"))
bmr = benchmark(lrns, add_features.task.cross.validation.over.under, makeResampleDesc("CV", iters = 5), measures = list(auc, fnr, fpr), models = TRUE)
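The aggregated cross-validation measures per learner can be pulled out of the benchmark result for comparison (getBMRAggrPerformances is mlr's accessor for this):
getBMRAggrPerformances(bmr, as.df = TRUE)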
Random Forest gives the best performance as measured by the aggregated test results from cross-validation. However, it gives the poorest performance on the hold-out data set, which suggests high variance, while the default GBM gives a more stable performance.
Random Forest
mod = train(makeLearner("classif.randomForest", predict.type = "prob"), add_features.task.cross.validation.over.under)
task.pred = predict(mod, task = add_features.task.hold.out)
performance(task.pred, measures = list(fnr, fpr))
fnr fpr
0.8000 0.0125
Continue with GBM for some simple feature selection and hyperparameter tuning. Tuning the other models and benchmarking their performance can be future work.
Here feature selection is integrated with hyperparameter tuning by treating the percentage of features to keep as a hyperparameter. Besides that, the number of trees, the learning rate (shrinkage), and the number of splits performed on a tree (interaction.depth) are also tuned for GBM, by randomly sampling points within the specified bounds at most 10 times.
lrn = makeFilterWrapper(learner = makeLearner("classif.gbm", predict.type = "prob", distribution = "bernoulli"), fw.method = "chi.squared")
ps = makeParamSet(makeDiscreteParam("fw.perc", seq(0.25,1,0.25)),
makeDiscreteParam("n.trees", seq(50,200,50)),
makeNumericParam("shrinkage", lower = 0.001, upper = 0.1),
makeDiscreteParam("interaction.depth", values = c(1,6))
)
rdesc = makeResampleDesc("CV", iters = 5)
res = tuneParams(lrn, task = add_features.task.cross.validation.over.under, resampling = rdesc, par.set = ps, measures = auc,
control = makeTuneControlRandom(maxit = 10))
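The best hyperparameters found and the corresponding cross-validated AUC can be read off the tuning result (res$x and res$y are the standard fields of an mlr tuning result):
res$x
res$y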
The tuned hyperparameters significantly improve the model performance as measured by the aggregated test results from cross-validation.
tuned_gbm <- makeFilterWrapper(learner = makeLearner("classif.gbm", predict.type = "prob", distribution = "bernoulli",
                                                     shrinkage = 0.09, interaction.depth = 6),
                               fw.perc = 0.5, fw.method = "chi.squared")
r = resample(tuned_gbm,
add_features.task.cross.validation.over.under, makeResampleDesc(method = "CV", iters = 5 ,predict = "both"), measures = list(auc,fpr, fnr))
r$aggr
auc.test.mean fpr.test.mean fnr.test.mean
0.99451293 0.04126802 0.01164308
df = generateThreshVsPerfData(r, measures = list(fpr, tpr, mmce))
plotROCCurves(df)
plotThreshVsPerf(df)
On the hold-out data set, the model trained with tuned hyperparameters reduces FPR. Not surprisingly, it still cannot correctly classify the 3 failed devices with relatively poor data quality. Improvement and fine-tuning are left for future work.
mod = train(tuned_gbm, add_features.task.cross.validation.over.under)
task.pred = predict(mod, task = add_features.task.hold.out)
performance(task.pred, measures = list(fnr, fpr))
fnr fpr
0.2666667 0.0550000